package cnki.kg.demo.entity;

import cnki.kg.demo.util.StringUtil;
import org.springframework.jdbc.core.JdbcTemplate;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.HashMap;
import java.util.List;
import java.util.stream.Collectors;

/**
 * WebMagic page processor for a single detail page.
 *
 * <p>For each processed page it collects every {@code div} whose class is
 * {@code contentTextCrfd}, reduces the markup to plain text with
 * {@link #getText(String)} and inserts the result into the {@code ctwhdetail}
 * table. If anything fails, a row describing the failure is written to
 * {@code ctwhspliderinfo} instead.
 *
 * <p>All SQL is executed with bound parameters, so page text and exception
 * messages containing quotes cannot break or alter the statements.
 */
public class CnkiDetailRepoPageProcessor implements PageProcessor {
    // Crawl configuration: retry a failed download 3 times, sleep 2000 ms between requests.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(2000);

    /** File name stored alongside the extracted content. */
    private String fileName;

    /** Template used for all inserts; package-private as in the original declaration. */
    JdbcTemplate jdbcTemplate;

    /** Identifier written to the {@code dataid} column of both tables. */
    private Integer dataId;

    /**
     * Creates a processor bound to one record.
     *
     * @param dataId       value written to the {@code dataid} column
     * @param jdbcTemplate template used to execute the inserts
     * @param filename     value written to the {@code filename} column
     */
    public CnkiDetailRepoPageProcessor(Integer dataId, JdbcTemplate jdbcTemplate, String filename) {
        this.dataId = dataId;
        this.jdbcTemplate = jdbcTemplate;
        this.fileName = filename;
    }

    public String getFileName() {
        return fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public JdbcTemplate getJdbcTemplate() {
        return jdbcTemplate;
    }

    public void setJdbcTemplate(JdbcTemplate jdbcTemplate) {
        this.jdbcTemplate = jdbcTemplate;
    }

    public Integer getDataId() {
        return dataId;
    }

    public void setDataId(Integer dataId) {
        this.dataId = dataId;
    }

    /**
     * Extracts the detail text from the page and stores it.
     *
     * <p>Nothing is inserted when the page has no matching {@code div} or when the
     * cleaned text is blank. Any exception raised while extracting or inserting is
     * reported on standard output and recorded in {@code ctwhspliderinfo} with
     * status {@code '0'}.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        try {
            List<Selectable> nodes = page.getHtml().xpath("//div[@class='contentTextCrfd']").nodes();
            if (!nodes.isEmpty()) {
                List<String> htmlContents = nodes.stream().map(Selectable::get).collect(Collectors.toList());
                String content = getText(String.join("", htmlContents));
                if (StringUtil.isNotBlank(content)) {
                    // Bound parameters: scraped text must never be spliced into the SQL string.
                    jdbcTemplate.update(
                            "insert into ctwhdetail (dataid,filename,content) values (?,?,?)",
                            dataId, fileName, content);
                }
            }
        } catch (Exception ex) {
            // The original format call had a %s with no argument, which itself threw
            // and prevented the failure row below from ever being written.
            System.out.println(String.format("条目[%s]细览插入报错", dataId));
            // String.valueOf keeps the previous "null" text when the exception has no message.
            String errorDetail = String.valueOf(ex.getMessage());
            System.out.println(errorDetail);
            // status stays the quoted literal '0' exactly as in the original statement.
            jdbcTemplate.update(
                    "insert into ctwhspliderinfo (dataid,filename,status,errordetail) values (?,?,'0',?)",
                    dataId, fileName, errorDetail);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Returns the raw (not URL-decoded) value of a query-string parameter.
     *
     * @param url       URL to inspect; may be {@code null}
     * @param paramName name of the parameter to look up
     * @return the text after the first {@code =} of the first parameter whose name
     *         equals {@code paramName}; an empty string when the URL is
     *         {@code null}/blank, has no query string, does not contain the
     *         parameter, or the parameter has no value
     */
    public String getFileName(String url, String paramName) {
        if (url == null) {
            return "";
        }
        String[] urlParts = url.trim().split("\\?");
        // A blank URL or one without '?' yields a single part: no query string.
        if (urlParts.length == 1) {
            return "";
        }
        for (String param : urlParts[1].split("&")) {
            int eq = param.indexOf('=');
            // Split on the first '=' only, so values that contain '=' are returned whole
            // and a parameter without '=' is treated as having an empty value.
            String key = eq < 0 ? param : param.substring(0, eq);
            if (key.equals(paramName)) {
                return eq < 0 ? "" : param.substring(eq + 1);
            }
        }
        return "";
    }

    /**
     * Reduces an HTML fragment to plain text.
     *
     * <p>Removes CRLF pairs, tags, tab/CR/LF characters and {@code &#...;}
     * sequences, and replaces every ASCII apostrophe with {@code ’}. Space
     * characters are left in place.
     *
     * @param content HTML fragment; must not be {@code null}
     * @return the cleaned text
     */
    public String getText(String content) {
        String txtcontent = content;
        txtcontent = txtcontent.replaceAll("\r\n", "");
        txtcontent = txtcontent.replaceAll("<br>", "");
        txtcontent = txtcontent.replaceAll("<small>", "").replaceAll("</small>", "");
        txtcontent = txtcontent.replaceAll("<sm>", "").replaceAll("</sm>", "");
        // Strip every remaining <...> / </...> tag.
        txtcontent = txtcontent.replaceAll("</?[^>]+>", "");
        // Remove tabs, carriage returns and line feeds (the <a> alternatives were already stripped above).
        txtcontent = txtcontent.replaceAll("<a>\\s*|\t|\r|\n|</a>", "");
        // The backslash escapes the next character in a replacement string, so ' becomes a plain ’.
        // Kept so stored content stays consistent with rows written before SQL parameters were used.
        txtcontent = txtcontent.replaceAll("'", "\\’");
        // NOTE(review): JavaScript-style literal; as a Java regex the slashes and "g" are matched
        // literally, so this looks like it never removes anything in practice. Behavior kept as is.
        txtcontent = txtcontent.replaceAll("/<(w+)[^>]+>.*?</1>/g", "");
        // Removes "&#" up to the next ';' (numeric character references such as &#160;).
        txtcontent = txtcontent.replaceAll("&#.+?;", "");
        // NOTE(review): matches one whitespace character followed by the literal "/g", not all
        // whitespace; presumably \s+ was intended - confirm before changing stored output.
        txtcontent = txtcontent.replaceAll("\\s/g", "");
        return txtcontent;
    }
}
